#!/usr/bin/Rscript

## FOR testing limit records to retieve from SQLite
retrieve = -1

# Databases ## IMPORTANT <- Facebook, Meetup and Forum go first, to create list of duplicated names (to be reamoved from the other matching algos)
databases <- list()
databases[['bg_facebook']] <-
    "m5s_fb_beppegrilloit_jan2015.sqlite"
databases[['m5s_facebook']] <-
    "m5s_fb_movimentocinquestelle_jan2015.sqlite"
databases[['pf_facebook']] <-
    "m5s_fb_public_figures_mar2015.sqlite"

databases[['meetup']] <-
    "m5s_meetup_jun2015.sqlite"

databases[['forum']] <-
    "m5s_forum_apr2015.sqlite"

databases[['blog']] <-
    "m5s_blog_mar2015.sqlite"

week_vector <- seq(from = as.Date("2005-02-01"), to = as.Date("2015-01-31"), by = 7)

require(data.table)
require(plyr)
require(stringi)

sqLiteConnect <- function(database, table) {
  require(DBI)
  con <- dbConnect(RSQLite::SQLite(), dbname = database)
  query <- dbSendQuery(con, paste("SELECT * FROM ", table, ";", sep="")) 
  result <- fetch(query, n = -1)
  dbClearResult(query)
  dbDisconnect(con)
  return(result)
}



                                        # Summary/Descritive statistics
labels <- names(databases)
sum_stats <- list()
timeseries <- list()


join_user_activity_df <- data.table(source = character(),
                                    label_id = character(),
                                    name = character(),
                                    date = numeric(),
                                    activity = factor()) # PIK to be assign at the end
activity_types <- c("post","comment","like","rsvp","join")

non_unique_names <- character()

for (label in labels) {
    print(label)
    if (label == "blog") {

        # Add non_unique_names dervived from cross-set comparison of Facebook users
        tmp_df <- unique(subset(join_user_activity_df, grepl('facebook', source), select=c("label_id", "name")))
        non_unique_names <- c(non_unique_names, tmp_df$name[duplicated(tolower(tmp_df$name))])
        non_unique_names <- unique(non_unique_names)
        rm(tmp_df)
        
        post_tbl <- sqLiteConnect(databases[[label]], "post", retrieve)
        comment_tbl <- sqLiteConnect(databases[[label]], "comment", retrieve)
        
        sum_stats[[label]] <- list()
        sum_stats[[label]][['posts']] <- nrow(post_tbl)
        sum_stats[[label]][['comments']] <- nrow(comment_tbl)

        timeseries[[label]] <- list()
        timeseries[[label]][['post']] <- as.data.frame(table(cut(as.Date(post_tbl$date), breaks = week_vector)))
        timeseries[[label]][['comment']] <- as.data.frame(table(cut(as.Date(comment_tbl$created), breaks = week_vector)))

        blog_user_activity_df <- data.table(source = label,
                                            label_id = as.numeric(interaction(tolower(comment_tbl$username), drop = TRUE)),
                                            name = tolower(comment_tbl$username),
                                            n_token = stri_count(comment_tbl$username, regex="\\S+"),
                                            date = as.numeric(as.POSIXct(comment_tbl$created)),
                                            activity = "comment"
                                            )

        blog_user_activity_df$label_id[blog_user_activity_df$name %in% tolower(non_unique_names)] <-  NA

        # IF compose by only one word drop unique ID and assign NA
        blog_user_activity_df$label_id[blog_user_activity_df$n_token < 2] <-  NA
        blog_user_activity_df$n_token <- NULL
        
        join_user_activity_df <- rbind(join_user_activity_df, blog_user_activity_df)

        print("Saving...")
        save(sum_stats, timeseries, file = "02_06_m5s_all_data_src_sumstat_ts.RData")
        save(join_user_activity_df, file = "02_06_m5s_join_user_activity_df.RData")
        save(non_unique_names, file = "02_06_m5s_join_user_non_unique_names.RData")
        rm(blog_user_activity_df, post_tbl, comment_tbl)    
    } else if (grepl('facebook', label)) {

        sum_stats[[label]] <- list()
        timeseries[[label]] <- list()
        
        post_tbl <- sqLiteConnect(databases[[label]], "post", retrieve)
        post_tbl <- post_tbl[,c("created_time","from","id")]
        post_tbl <- rename(post_tbl, c("id"="post_id", "from"="profile_id"))
        sum_stats[[label]][['posts']] <- nrow(post_tbl)
        timeseries[[label]][['post']] <- as.data.frame(table(cut(as.Date(post_tbl$created_time),
                                                                 breaks = week_vector)))
        post_tbl <- data.table(post_tbl)

        # HERE I remove few rows based on the assumption that low scoped-id my not be uniquely assigned to users... 
        # You will need to test the fact that this is true...
        post_tbl <- subset(post_tbl, !(grepl('^10000', profile_id)))
        
        setkeyv(post_tbl, c("post_id","profile_id"))
        
        profile_tbl <- sqLiteConnect(databases[[label]], "profile", retrieve)
        profile_tbl <- profile_tbl[,c("id","name")]
        profile_tbl <- rename(profile_tbl, c("id"="profile_id"))
        sum_stats[[label]][['profiles']] <- nrow(profile_tbl)
        profile_tbl <- data.table(profile_tbl)

        profile_tbl <- subset(profile_tbl, !(grepl('^10000', profile_id)))
        
        setkey(profile_tbl, "profile_id")

        # Add to non unique names
        non_unique_names <- c(non_unique_names, profile_tbl$name[duplicated(tolower(profile_tbl$name))])
        non_unique_names <- unique(non_unique_names)
        
        
        # Posting
        post_profile <- merge(post_tbl, profile_tbl)

        join_user_activity_df <- rbind(join_user_activity_df,
                                       data.table(source = label,
                                                  label_id = post_profile$profile_id,
                                                  name = tolower(post_profile$name),
                                                  date = as.numeric(as.POSIXct(post_profile$created_time)),
                                                  activity = "post"
                                                  )
                                       )
        rm(post_profile)
        
        # Commenting
        comment_tbl <- sqLiteConnect(databases[[label]], "comment", retrieve)
        comment_tbl <- comment_tbl[,c("created_time","from")]
        comment_tbl <- rename(comment_tbl, c("from"="profile_id"))
        sum_stats[[label]][['comments']] <- nrow(comment_tbl)
        timeseries[[label]][['comments']] <- as.data.frame(table(cut(as.Date(comment_tbl$created_time),
                                                                 breaks = week_vector)))
        comment_tbl <- data.table(comment_tbl)
        comment_tbl <- subset(comment_tbl, !(grepl('^10000', profile_id)))
        setkey(comment_tbl, "profile_id")

        comment_profile <- merge(comment_tbl, profile_tbl)
        join_user_activity_df <- rbind(join_user_activity_df,
                                       data.table(source = label,
                                                  label_id = comment_profile$profile_id,
                                                  name = tolower(comment_profile$name),
                                                  date = as.numeric(as.POSIXct(comment_profile$created_time)),
                                                  activity = "comment"
                                                  )
                                       )
        rm(comment_profile)
        
        # Liking
        post_likes_tbl <- sqLiteConnect(databases[[label]], "post_likes", retrieve)
        post_likes_tbl <- post_likes_tbl[,c("post_id","profile_id")]
        sum_stats[[label]][['post_likes']] <- nrow(post_likes_tbl)
        
        post_likes_tbl <- data.table(post_likes_tbl)
        post_likes_tbl <- subset(post_likes_tbl, !(grepl('^10000', profile_id)))
        setkey(post_likes_tbl, "profile_id")

        post_likes_profile <- merge(post_likes_tbl, profile_tbl)
        setkey(post_likes_profile, "post_id")
        post_likes_profile <- merge(post_likes_profile, 
                                    subset(post_tbl, select=c("post_id","created_time")))

        timeseries[[label]][['post_likes']] <- as.data.frame(table(cut(as.Date(post_likes_profile$created_time),
                                                                 breaks = week_vector)))
        
        join_user_activity_df <- rbind(join_user_activity_df,
                                       data.table(source = label,
                                                  label_id = post_likes_profile$profile_id,
                                                  name = tolower(post_likes_profile$name),
                                                  date = as.numeric(as.POSIXct(post_likes_profile$created_time)),
                                                  activity = "like"
                                                  )
                                       )
        print("Saving...")
        save(sum_stats, timeseries, file = "02_06_m5s_all_data_src_sumstat_ts.RData")
        save(join_user_activity_df, file = "02_06_m5s_join_user_activity_df.RData")
        save(non_unique_names, file = "02_06_m5s_join_user_non_unique_names.RData")
        rm(post_tbl, profile_tbl, comment_tbl, post_likes_tbl, post_likes_profile)

    } else if (label == 'forum') {

        sum_stats[[label]] <- list()
        timeseries[[label]] <- list()

        # Posting
        post_tbl <- sqLiteConnect(databases[[label]], "thread", retrieve)
        post_tbl <- post_tbl[,c("createdAt","authorUrl")]
        sum_stats[[label]][['posts']] <- nrow(post_tbl)
        timeseries[[label]][['post']] <- as.data.frame(table(cut(as.Date(post_tbl$createdAt),
                                                                 breaks = week_vector)))
        post_tbl <- data.table(post_tbl)
        setkey(post_tbl, "authorUrl")

        author_tbl <- sqLiteConnect(databases[[label]], "author", retrieve)
        sum_stats[[label]][['post_authors']] <- nrow(author_tbl)
        author_tbl <- author_tbl[,c("name","url")]
        author_tbl <- rename(author_tbl, c("url"="authorUrl"))
        author_tbl <- data.table(author_tbl)
        setkey(author_tbl, "authorUrl")

        non_unique_names <- c(non_unique_names, author_tbl$name[duplicated(tolower(author_tbl$name))])
        non_unique_names <- unique(non_unique_names)

        post_author <- merge(post_tbl, author_tbl)
        post_author <- subset(post_author, authorUrl!="")

        join_user_activity_df <- rbind(join_user_activity_df,
                                       data.table(source = label,
                                                  label_id = post_author$authorUrl,
                                                  name = tolower(post_author$name),
                                                  date = as.numeric(as.POSIXct(post_author$createdAt, , format="%Y-%m-%dT%H:%M:%S")),
                                                  activity = "post"
                                                  )
                                       )


        rm(post_tbl, author_tbl, post_author)
        
        # Commenting
        post_tbl <- sqLiteConnect(databases[[label]], "comment", retrieve)
        post_tbl <- post_tbl[,c("createdAt","authorUrl")]
        sum_stats[[label]][['comments']] <- nrow(post_tbl)
        timeseries[[label]][['comment']] <- as.data.frame(table(cut(as.Date(post_tbl$createdAt, , format="%Y-%m-%dT%H:%M:%S"),
                                                                    breaks = week_vector)))
        post_tbl <- data.table(post_tbl)
        setkey(post_tbl, "authorUrl")

        author_tbl <- sqLiteConnect(databases[[label]], "commentAuthor", retrieve)
        sum_stats[[label]][['comment_authors']] <- nrow(author_tbl)
        author_tbl <- author_tbl[,c("name","authorUrl")]
        author_tbl <- data.table(author_tbl)
        setkey(author_tbl, "authorUrl")

        post_author <- merge(post_tbl, author_tbl)
        sum_stats[[label]][['anonymous_comments']] <- 
          nrow(subset(post_author, authorUrl==""))
        post_author <- subset(post_author, authorUrl!="")


        forum_user_activity_df <- data.table(source = label,
                                             label_id = as.numeric(interaction(tolower(post_author$name), drop = TRUE)),
                                             original_id = post_author$authorUrl,
                                             name = tolower(post_author$name),
                                             n_token = stri_count(post_author$name, regex="\\S+"),
                                             date = as.numeric(as.POSIXct(post_author$createdAt)),
                                             activity = "post" # <-- This may change in the future to 'comment'
                                             )

        forum_user_activity_df$label_id[forum_user_activity_df$name %in% tolower(non_unique_names)] <-  NA
        
        # IF compose by only one word drop unique ID and assign NA
        forum_user_activity_df$label_id[forum_user_activity_df$n_token < 2] <-  NA

        logical <- grepl('disqus', forum_user_activity_df$original_id) &
            !(grepl('guest', forum_user_activity_df$original_id))
        
        forum_user_activity_df$label_id[logical] <- forum_user_activity_df$original_id[logical]
        forum_user_activity_df$n_token <- NULL
        forum_user_activity_df$original_id <- NULL

        join_user_activity_df <- rbind(join_user_activity_df, forum_user_activity_df)
                                                
        print("Saving...")
        save(sum_stats, timeseries, file = "02_06_m5s_all_data_src_sumstat_ts.RData")
        save(join_user_activity_df, file = "02_06_m5s_join_user_activity_df.RData")
        save(non_unique_names, file = "02_06_m5s_join_user_non_unique_names.RData")
        rm(post_tbl, author_tbl, post_author)   

    } else if (label == 'meetup'){

        sum_stats[[label]] <- list()
        timeseries[[label]] <- list()

        # Groups
        group_tbl <- sqLiteConnect(databases[[label]], "[group]", retrieve)
        group_tbl <- group_tbl[,c("group_id","created")]
        sum_stats[[label]][['groups']] <- nrow(group_tbl)
        timeseries[[label]][['group']] <- as.data.frame(table(cut(as.Date(group_tbl$created),
                                                                  breaks = week_vector)))

        rm(group_tbl)

        # Event
        event_tbl <- sqLiteConnect(databases[[label]], "event", retrieve)
        event_tbl <- event_tbl[,c("event_id","time")]
        sum_stats[[label]][['events']] <- nrow(event_tbl)
        timeseries[[label]][['event']] <- as.data.frame(table(cut(as.Date(event_tbl$time),
                                                                  breaks = week_vector)))

        rm(event_tbl)

        
        # Members
        member_tbl <- sqLiteConnect(databases[[label]], "member", retrieve)
        member_tbl <- member_tbl[,c("joined", "member_id", "name")]
        sum_stats[[label]][['members']] <- nrow(member_tbl)
        timeseries[[label]][['member']] <- as.data.frame(table(cut(as.Date(member_tbl$joined),
                                                                   breaks = week_vector)))
        member_tbl <- data.table(member_tbl)
        setkey(member_tbl, "member_id")

        non_unique_names <- c(non_unique_names, member_tbl$name[duplicated(tolower(member_tbl$name))])
        non_unique_names <- unique(non_unique_names)

        join_user_activity_df <- rbind(join_user_activity_df,
                                       data.table(source = rep(label, sum_stats[[label]][['members']]),
                                                  label_id = member_tbl$member_id,
                                                  name = gsub("^\\s+|\\s+$", "", tolower(member_tbl$name)),
                                                  date = as.numeric(as.POSIXct(member_tbl$joined)),
                                                  activity = rep("join", sum_stats[[label]][['members']])
                                                  )
                                       )

        rsvps_tbl <- sqLiteConnect(databases[[label]], "rsvps", retrieve)
        rsvps_tbl <- rsvps_tbl[,c("member_id", "created")]
        sum_stats[[label]][['rsvps']] <- nrow(rsvps_tbl)
        timeseries[[label]][['rsvps']] <- as.data.frame(table(cut(as.Date(rsvps_tbl$created),
                                                                  breaks = week_vector)))
        rsvps_tbl <- data.table(rsvps_tbl)
        setkey(rsvps_tbl, "member_id")
        
        member_rsvp <- merge(rsvps_tbl, member_tbl)
        join_user_activity_df <- rbind(join_user_activity_df,
                                       data.table(source = rep(label, nrow(member_rsvp)),
                                                  label_id = member_rsvp$member_id,
                                                  name = gsub("^\\s+|\\s+$", "", tolower(member_rsvp$name)),
                                                  date = as.numeric(as.POSIXct(member_rsvp$created)),
                                                  activity = rep("rsvp", nrow(member_rsvp))
                                                  )
                                       )
        print("Saving...")
        save(sum_stats, timeseries, file = "02_06_m5s_all_data_src_sumstat_ts.RData")
        save(join_user_activity_df, file = "02_06_m5s_join_user_activity_df.RData")
    }
}

join_user_activity_df$pik <- seq(1, nrow(join_user_activity_df))
join_user_activity_df$activity <- factor(join_user_activity_df$activity, levels=activity_types)
join_user_activity_df$source <- factor(join_user_activity_df$source, levels=labels)

save(join_user_activity_df, file = "02_06_m5s_join_user_activity_df.RData")
save(non_unique_names, file = "02_06_m5s_join_user_non_unique_names.RData")
